#!/usr/bin/env python3
"""
Hybrid Search Example for Local Deep Research

This example demonstrates how to combine multiple search sources:
1. Multiple named retrievers for different document types
2. Combining custom retrievers with web search
3. Analyzing and comparing sources from different origins
"""

from typing import List
from langchain_core.retrievers import Document, BaseRetriever
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings

from local_deep_research.api import quick_summary, detailed_research
from local_deep_research.api.settings_utils import create_settings_snapshot


class TechnicalDocsRetriever(BaseRetriever):
    """Mock retriever for technical documentation."""

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return mock technical documents."""
        # In a real scenario, this would search actual technical docs
        return [
            Document(
                page_content=f"Technical specification for {query}: Implementation requires careful consideration of system architecture, performance metrics, and scalability factors.",
                metadata={
                    "source": "tech_docs",
                    "type": "specification",
                    "title": f"Technical Spec: {query}",
                },
            ),
            Document(
                page_content=f"Best practices for {query}: Follow industry standards, implement proper error handling, and ensure comprehensive testing coverage.",
                metadata={
                    "source": "tech_docs",
                    "type": "best_practices",
                    "title": f"Best Practices: {query}",
                },
            ),
        ]

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        """Async version."""
        return self.get_relevant_documents(query)


class BusinessDocsRetriever(BaseRetriever):
    """Mock retriever for business/strategy documents."""

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return mock business documents."""
        return [
            Document(
                page_content=f"Business implications of {query}: Consider market impact, ROI analysis, and strategic alignment with organizational goals.",
                metadata={
                    "source": "business_docs",
                    "type": "strategy",
                    "title": f"Business Strategy: {query}",
                },
            ),
            Document(
                page_content=f"Cost-benefit analysis for {query}: Initial investment requirements, expected returns, and risk assessment factors.",
                metadata={
                    "source": "business_docs",
                    "type": "analysis",
                    "title": f"Cost Analysis: {query}",
                },
            ),
        ]

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        """Async version."""
        return self.get_relevant_documents(query)


def create_knowledge_base_retriever() -> BaseRetriever:
    """Create a FAISS-based retriever with sample knowledge base documents."""
    documents = [
        Document(
            page_content="Machine learning models require training data, validation strategies, and performance metrics for evaluation.",
            metadata={"source": "ml_knowledge_base", "topic": "ml_basics"},
        ),
        Document(
            page_content="Cloud computing provides scalable infrastructure, reducing capital expenditure and enabling flexible resource allocation.",
            metadata={
                "source": "cloud_knowledge_base",
                "topic": "cloud_benefits",
            },
        ),
        Document(
            page_content="Agile methodology emphasizes iterative development, customer collaboration, and responding to change.",
            metadata={"source": "project_knowledge_base", "topic": "agile"},
        ),
        Document(
            page_content="Data privacy regulations like GDPR require explicit consent, data minimization, and user rights management.",
            metadata={
                "source": "compliance_knowledge_base",
                "topic": "privacy",
            },
        ),
    ]

    # Create embeddings and vector store
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    vectorstore = FAISS.from_documents(documents, embeddings)
    return vectorstore.as_retriever(search_kwargs={"k": 2})


def demonstrate_multiple_retrievers():
    """Show how to use multiple named retrievers for different document types."""
    print("=" * 70)
    print("MULTIPLE NAMED RETRIEVERS")
    print("=" * 70)
    print("""
Using multiple specialized retrievers:
- Technical documentation retriever
- Business documentation retriever
- Knowledge base retriever
Each provides different perspectives on the same topic.
    """)

    # Create different retrievers
    tech_retriever = TechnicalDocsRetriever()
    business_retriever = BusinessDocsRetriever()
    kb_retriever = create_knowledge_base_retriever()

    # Configure settings
    settings = create_settings_snapshot(
        {
            "search.tool": "auto",  # Will use all provided retrievers
        }
    )

    # Use multiple retrievers in research
    result = quick_summary(
        query="Implementing machine learning in production",
        settings_snapshot=settings,
        retrievers={
            "technical": tech_retriever,
            "business": business_retriever,
            "knowledge_base": kb_retriever,
        },
        search_tool="auto",  # Use all retrievers
        iterations=2,
        questions_per_iteration=2,
        programmatic_mode=True,
    )

    print("\nResearch Summary (first 400 chars):")
    print(result["summary"][:400] + "...")

    # Analyze sources by type
    sources = result.get("sources", [])
    print(f"\nTotal sources found: {len(sources)}")

    # Group sources by retriever
    source_types = {}
    for source in sources:
        if isinstance(source, dict):
            source_type = source.get("metadata", {}).get("source", "unknown")
        else:
            source_type = "other"
        source_types[source_type] = source_types.get(source_type, 0) + 1

    print("\nSources by retriever:")
    for stype, count in source_types.items():
        print(f"  - {stype}: {count} sources")

    return result


def demonstrate_retriever_plus_web():
    """Show how to combine custom retrievers with web search."""
    print("\n" + "=" * 70)
    print("RETRIEVER + WEB SEARCH COMBINATION")
    print("=" * 70)
    print("""
Combining internal knowledge with web search:
- Internal: Custom retriever with proprietary knowledge
- External: Wikipedia for general context
This provides both specific and general information.
    """)

    # Create internal knowledge retriever
    internal_retriever = create_knowledge_base_retriever()

    # Configure settings to use both retriever and web
    settings = create_settings_snapshot(
        {
            "search.tool": "wikipedia",  # Also use Wikipedia
        }
    )

    # Research combining internal and external sources
    result = detailed_research(
        query="Best practices for cloud migration",
        settings_snapshot=settings,
        retrievers={
            "internal_kb": internal_retriever,
        },
        search_tool="wikipedia",  # Also search Wikipedia
        search_strategy="source-based",
        iterations=2,
        questions_per_iteration=3,
        programmatic_mode=True,
    )

    print(f"\nResearch ID: {result['research_id']}")
    print(f"Summary length: {len(result['summary'])} characters")

    # Analyze source distribution
    sources = result.get("sources", [])
    internal_sources = 0
    external_sources = 0

    for source in sources:
        if isinstance(source, dict) and "knowledge_base" in str(source):
            internal_sources += 1
        else:
            external_sources += 1

    print("\nSource distribution:")
    print(f"  - Internal knowledge base: {internal_sources} sources")
    print(f"  - External (Wikipedia): {external_sources} sources")

    # Show how different sources complement each other
    print("\nComplementary insights from hybrid search:")
    print(
        "  - Internal sources provide: Specific procedures, proprietary knowledge"
    )
    print(
        "  - External sources provide: Industry context, general best practices"
    )

    return result


def demonstrate_source_analysis():
    """Show how to analyze and compare sources from different origins."""
    print("\n" + "=" * 70)
    print("SOURCE ANALYSIS AND COMPARISON")
    print("=" * 70)
    print("""
Analyzing source quality and relevance:
- Track source origins
- Compare information consistency
- Identify unique insights from each source type
    """)

    # Create multiple retrievers
    tech_retriever = TechnicalDocsRetriever()
    business_retriever = BusinessDocsRetriever()

    settings = create_settings_snapshot(
        {
            "search.tool": "wikipedia",
        }
    )

    # Run research with detailed source tracking
    result = quick_summary(
        query="Artificial intelligence implementation strategies",
        settings_snapshot=settings,
        retrievers={
            "technical": tech_retriever,
            "business": business_retriever,
        },
        search_tool="wikipedia",  # Also use web search
        iterations=2,
        questions_per_iteration=2,
        programmatic_mode=True,
    )

    # Detailed source analysis
    print("\nSource Analysis:")
    sources = result.get("sources", [])

    # Categorize sources
    source_categories = {"technical": [], "business": [], "web": []}

    for source in sources:
        if isinstance(source, dict):
            source_type = source.get("metadata", {}).get("source", "")
            if "tech" in source_type:
                source_categories["technical"].append(source)
            elif "business" in source_type:
                source_categories["business"].append(source)
            else:
                source_categories["web"].append(source)
        else:
            source_categories["web"].append(source)

    # Report on each category
    for category, category_sources in source_categories.items():
        print(f"\n{category.upper()} Sources ({len(category_sources)}):")
        if category_sources:
            for i, source in enumerate(category_sources[:2], 1):  # Show first 2
                if isinstance(source, dict):
                    title = source.get("metadata", {}).get("title", "Untitled")
                    print(f"  {i}. {title}")
                else:
                    print(f"  {i}. {str(source)[:60]}...")

    # Show findings breakdown
    findings = result.get("findings", [])
    print(f"\nTotal findings: {len(findings)}")
    print("Findings provide integrated insights from all source types")

    return result


def demonstrate_meta_search_config():
    """Show how to use meta search configuration for complex setups."""
    print("\n" + "=" * 70)
    print("META SEARCH CONFIGURATION")
    print("=" * 70)
    print("""
Using meta search for sophisticated search strategies:
- Combine multiple search engines
- Configure aggregation and deduplication
- Control search priority and weighting
    """)

    # Create retrievers
    tech_retriever = TechnicalDocsRetriever()

    settings = create_settings_snapshot({})

    # Advanced meta search configuration
    result = quick_summary(
        query="Quantum computing applications",
        settings_snapshot=settings,
        retrievers={
            "tech_docs": tech_retriever,
        },
        search_tool="meta",  # Use meta search
        meta_search_config={
            "retrievers": ["tech_docs"],  # Include custom retriever
            "engines": ["wikipedia", "arxiv"],  # Also search these
            "aggregate": True,  # Combine results
            "deduplicate": True,  # Remove duplicate content
            "max_results_per_engine": 5,  # Limit per source
        },
        iterations=2,
        questions_per_iteration=3,
        programmatic_mode=True,
    )

    print("\nMeta search results:")
    print(f"  - Total sources: {len(result.get('sources', []))}")
    print(f"  - Summary length: {len(result.get('summary', ''))} chars")
    print(f"  - Findings: {len(result.get('findings', []))}")

    print("\nMeta search advantages:")
    print("  - Comprehensive coverage from multiple sources")
    print("  - Automatic deduplication of similar content")
    print("  - Balanced perspective from different source types")

    return result


def main():
    """Run all hybrid search demonstrations."""
    print("=" * 70)
    print("LOCAL DEEP RESEARCH - HYBRID SEARCH DEMONSTRATION")
    print("=" * 70)
    print("""
This example shows how to combine multiple search sources:
- Custom retrievers for proprietary knowledge
- Web search engines for public information
- Meta search for sophisticated strategies
    """)

    # Run demonstrations
    demonstrate_multiple_retrievers()
    demonstrate_retriever_plus_web()
    demonstrate_source_analysis()
    demonstrate_meta_search_config()

    print("\n" + "=" * 70)
    print("KEY TAKEAWAYS")
    print("=" * 70)
    print("""
1. Multiple Retrievers: Use specialized retrievers for different document types
2. Hybrid Search: Combine internal knowledge with web search for comprehensive results
3. Source Analysis: Track and analyze sources to understand information origin
4. Meta Search: Configure complex search strategies with aggregation and deduplication

Best Practices:
- Name your retrievers descriptively for easy tracking
- Balance internal and external sources based on your needs
- Use source analysis to verify information consistency
- Configure meta search for optimal result aggregation
    """)

    print("\n✓ Hybrid search demonstration complete!")


if __name__ == "__main__":
    main()
